# Load the three CSV inputs used by this analysis. read.csv() already
# defaults to a comma separator and a header row, so both are left implicit.
friends <- read.csv("friends.csv")
friends_info <- read.csv("friends_info.csv")
friends_emotions <- read.csv("friends_emotions.csv")
library(tidyverse)

# What is the distribution of lines (measured by the frequency of the variable "text") among the six main characters throughout the 10 seasons? Does any non-main character (e.g., Mike, Richard, Janice) have more lines than a main character in any episode?

# The six leads, written the way they are matched against `speaker`
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# One row per lead holding the number of rows (lines) attributed to them,
# ordered from most to fewest lines
line_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  summarise(total_lines = n()) |>
  arrange(desc(total_lines))

# Print a heading followed by the table
cat("Distribution of lines spoken by main characters:\n")
## Distribution of lines spoken by main characters:
print(line_distribution)
## # A tibble: 6 × 2
##   speaker        total_lines
##   <chr>                <int>
## 1 Rachel Green          9312
## 2 Ross Geller           9157
## 3 Chandler Bing         8465
## 4 Monica Geller         8441
## 5 Joey Tribbiani        8171
## 6 Phoebe Buffay         7501
# Bar chart of lines per main character, bars ordered from fewest to most
ggplot(
  line_distribution,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Distribution of Lines Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  theme_minimal() +
  # Tilt the character names so they stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# The six leads (repeated here so this section can be run on its own)
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# Per-character sum of the `utterance` column, largest total first.
# NOTE(review): this adds up the values stored in `utterance`. If that column
# is a running utterance number within a scene rather than a size measure,
# the total is not a meaningful "amount spoken" -- confirm against the data
# dictionary before interpreting it.
utterance_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  summarise(
    total_utterance = sum(utterance, na.rm = TRUE)
  ) |>
  arrange(desc(total_utterance))

# Print a heading followed by the table
cat("Distribution of utterances spoken by main characters:\n")
## Distribution of utterances spoken by main characters:
print(utterance_distribution)
## # A tibble: 6 × 2
##   speaker        total_utterance
##   <chr>                    <int>
## 1 Rachel Green            187427
## 2 Ross Geller             182134
## 3 Monica Geller           157199
## 4 Chandler Bing           154488
## 5 Joey Tribbiani          151005
## 6 Phoebe Buffay           132168
# Bar chart of the summed `utterance` values per main character,
# bars ordered from smallest to largest total
ggplot(
  utterance_distribution,
  aes(x = reorder(speaker, total_utterance), y = total_utterance)
) +
  geom_col(fill = "lightgreen") +
  labs(
    title = "Distribution of Utterances Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  # Tilt the character names so they stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# The six leads (repeated here so this section can be run on its own)
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# Totals for every other speaker with more than 150 lines, most lines first.
# Rows with a missing speaker, and the "#ALL#" and "Scene Directions"
# entries, are left out along with the six leads.
non_main_characters <- friends |>
  filter(
    !is.na(speaker),
    !(speaker %in% c(main_characters, "#ALL#", "Scene Directions"))
  ) |>
  group_by(speaker) |>
  summarise(
    total_lines = n(),
    total_utterance = sum(utterance, na.rm = TRUE)
  ) |>
  filter(total_lines > 150) |>
  arrange(desc(total_lines))

# Bar chart of line counts for the retained non-main characters
# (only total_lines is plotted), bars ordered from fewest to most
ggplot(
  non_main_characters,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  geom_col(fill = "lightcoral") +
  labs(
    title = "Non-Main Characters with most lines in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  theme_minimal() +
  # Tilt the character names so they stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Lines per season for the three selected recurring characters.
# complete() adds a zero-count row for every season 1-10 / speaker pair that
# has no lines, so each speaker has an entry for every season.
speaker_distribution <- friends |>
  filter(
    speaker %in% c("Janice Litman Goralnik", "Mike Hannigan", "Richard Burke")
  ) |>
  group_by(speaker, season) |>
  summarise(Number_of_Lines = n(), .groups = "drop") |>
  complete(season = 1:10, speaker, fill = list(Number_of_Lines = 0))

# Grouped bars: one cluster per season, one bar per selected speaker
ggplot(
  speaker_distribution,
  aes(x = factor(season), y = Number_of_Lines, fill = speaker)
) +
  geom_col(position = "dodge", color = "black") +
  # Fixed fill colours, assigned to the speakers in scale order
  scale_fill_manual(values = c("skyblue", "orange", "lightgreen")) +
  labs(
    title = "Line Distribution for Selected Characters Across Seasons",
    x = "Season",
    y = "Number of Lines",
    fill = "Speaker"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

# Lines per speaker within each episode (main and non-main characters).
# Rows that do not belong to a single character are excluded explicitly:
# missing speakers, "#ALL#", and "Scene Directions". Previously only
# "Scene Directions" was named, which dropped NA speakers as a side effect of
# the `!=` comparison and let "#ALL#" compete as if it were a character.
line_distribution <- friends |>
  filter(
    !is.na(speaker),
    !(speaker %in% c("#ALL#", "Scene Directions"))
  ) |>
  group_by(season, episode, speaker) |>
  summarise(total_lines = n(), .groups = "drop")

# Keep, for every episode, the speaker(s) with the highest line count.
# Ties are all retained, so an episode can contribute more than one row.
# The result stays grouped by season and episode.
most_lines_per_episode <- line_distribution |>
  group_by(season, episode) |>
  filter(total_lines == max(total_lines))

# Output the result
print(most_lines_per_episode)
## # A tibble: 244 × 4
## # Groups:   season, episode [236]
##    season episode speaker       total_lines
##     <int>   <int> <chr>               <int>
##  1      1       1 Monica Geller          73
##  2      1       2 Ross Geller            68
##  3      1       3 Monica Geller          52
##  4      1       4 Monica Geller          47
##  5      1       5 Ross Geller            40
##  6      1       6 Chandler Bing          58
##  7      1       7 Ross Geller            53
##  8      1       8 Ross Geller            61
##  9      1       9 Monica Geller          48
## 10      1      10 Phoebe Buffay          51
## # ℹ 234 more rows
# Episodes whose top speaker is not one of the six leads
non_main_results <- most_lines_per_episode |>
  filter(!(speaker %in% main_characters))

# Report the matches, or say that there are none
if (nrow(non_main_results) == 0) {
  print("No episodes where a non-main character has the most lines.")
} else {
  print("Episodes where a non-main character has the most lines:")
  print(non_main_results)
}
## [1] "Episodes where a non-main character has the most lines:"
## # A tibble: 2 × 4
## # Groups:   season, episode [2]
##   season episode speaker      total_lines
##    <int>   <int> <chr>              <int>
## 1      6      21 Paul Stevens          44
## 2      9       8 Amy Green             58

# Information about Amy Green

# One-row summary for the speaker "Amy Green": number of rows (lines) and the
# sum of the `utterance` column over those rows
amy_green_stats <- friends |>
  filter(speaker == "Amy Green") |>
  summarise(
    total_lines = n(),
    total_utterance = sum(utterance, na.rm = TRUE)
  )

# Print each figure under its own heading
cat("Number of lines spoken by Amy Green:\n")
## Number of lines spoken by Amy Green:
print(amy_green_stats[["total_lines"]])
## [1] 123
cat("Total utterances by Amy Green:\n")
## Total utterances by Amy Green:
print(amy_green_stats[["total_utterance"]])
## [1] 9024
# Number of speakers to report. It drives both the cut-off and the printed
# heading, so the two can no longer disagree (the heading used to say
# "Top 10" while 20 rows were returned).
n_top_speakers <- 20

# Sum the `utterance` column per speaker and keep the largest totals.
# No speaker values are filtered out here, so rows with a missing speaker and
# entries such as "#ALL#" or "Scene Directions" are ranked with the rest.
top_utterances <- friends |>
  group_by(speaker) |>
  summarise(total_utterance = sum(utterance, na.rm = TRUE)) |>
  arrange(desc(total_utterance)) |>
  head(n_top_speakers)

# Print a heading that reflects the actual number of rows requested
cat("Top", n_top_speakers, "people with highest utterances:\n")
## Top 20 people with highest utterances:
print(top_utterances)
## # A tibble: 20 × 2
##    speaker          total_utterance
##    <chr>                      <int>
##  1 Rachel Green              187427
##  2 Ross Geller               182134
##  3 Monica Geller             157199
##  4 Chandler Bing             154488
##  5 Joey Tribbiani            151005
##  6 Phoebe Buffay             132168
##  7 Scene Directions           76847
##  8 Amy Green                   9024
##  9 #ALL#                       7556
## 10 <NA>                        6301
## 11 Mike Hannigan               4585
## 12 Judy Geller                 4101
## 13 Richard Burke               3852
## 14 Will Colbert                3637
## 15 Jack Geller                 3423
## 16 Frank Buffay Jr.            3334
## 17 Emily Waltham               3119
## 18 Charlie Wheeler             2868
## 19 Eric                        2864
## 20 Tag Jones                   2792
# Per-season count of Joey Tribbiani's lines whose text contains "How you",
# most frequent season first. Seasons with no match do not appear.
# NOTE(review): the pattern is case-sensitive and matches any line containing
# "How you", not only the "How you doin'" catchphrase -- confirm that this is
# the intended definition.
joey_lines <- friends |>
  filter(
    speaker == "Joey Tribbiani",
    str_detect(text, "How you")
  ) |>
  group_by(season) |>
  summarise(frequency = n()) |>
  arrange(desc(frequency))

# Print a heading followed by the table
cat("Distribution of frequency of 'How you' said by Joey:\n")
## Distribution of frequency of 'How you' said by Joey:
print(joey_lines)
## # A tibble: 7 × 2
##   season frequency
##    <int>     <int>
## 1      6         5
## 2      4         3
## 3      5         3
## 4      9         3
## 5      1         2
## 6      8         2
## 7      7         1
# Season(s) with the highest phrase count; ties keep every such season.
# select() is namespace-qualified so it still resolves to dplyr when another
# attached package (for example MASS) also exports a function named select().
season_most <- joey_lines |>
  filter(frequency == max(frequency)) |>
  dplyr::select(season)

cat("Joey said 'How you doin?' the most in Season:", season_most$season, "\n")
## Joey said 'How you doin?' the most in Season: 6
# Line plot of the phrase count per season.
# joey_lines only has rows for seasons with at least one match, so seasons
# 1-10 are filled in with a zero count before plotting. Without this the line
# is drawn straight across the missing seasons and suggests non-zero counts.
joey_lines |>
  complete(season = 1:10, fill = list(frequency = 0L)) |>
  ggplot(aes(x = season, y = frequency)) +
  geom_point(color = "blue", size = 3) +
  geom_line(group = 1, color = "blue") +  # Connect the points in season order
  theme_minimal() +
  labs(
    title = "Frequency of 'How you doin?' Said by Joey",
    x = "Season",
    y = "Frequency"
  ) +
  scale_x_continuous(breaks = 1:10, limits = c(1, 10)) +  # One tick per season
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

# Keep only the rows spoken by Joey
joey_lines <- subset(friends, speaker == "Joey Tribbiani")

# Flag each of Joey's lines that contains the catchphrase, ignoring case.
# The pattern deliberately has no trailing "?": inside a regular expression
# "?" is a quantifier, so "doin?" meant "doi" followed by an optional "n"
# rather than a literal question mark. The stem "how you doin" also covers
# spellings such as "doin'" and "doing". The flags are computed once and
# reused for both the overall and the per-season counts.
says_how_you_doin <- grepl("how you doin", joey_lines$text, ignore.case = TRUE)

# Overall number of matching lines
how_you_doin_count <- sum(says_how_you_doin)

# Print the result
cat("Joey says 'How you doin?'", how_you_doin_count, "times.\n")
## Joey says 'How you doin?' 25 times.
# Matching lines per season; seasons without a match get a count of 0.
# Naming the list elements gives the result its "season" and "count" columns
# directly.
how_you_doin_by_season <- aggregate(
  x = list(count = says_how_you_doin),
  by = list(season = joey_lines$season),
  FUN = sum
)

# Load ggplot2 library for plotting
library(ggplot2)

# Line plot of the per-season catchphrase count.
# Line thickness is set with `linewidth`; using `size` for lines is
# deprecated since ggplot2 3.4.0 and raised a warning here.
ggplot(how_you_doin_by_season, aes(x = season, y = count)) +
  geom_line(color = "blue", linewidth = 1) +   # Line with color and thickness
  geom_point(color = "red", size = 3) +        # Points at each data value
  labs(
    title = "Number of Times Joey Says 'How you doin?' by Season",
    x = "Season",
    y = "Count"
  ) +
  scale_x_continuous(breaks = 0:10, limits = c(0, 10)) +  # X-axis from 0 to 10
  theme_minimal()

# Number of lines containing "I love you" (case-sensitive) for each of the
# six leads, highest count first
love_you_count <- friends |>
  filter(
    speaker %in% main_characters,
    str_detect(text, "I love you")
  ) |>
  group_by(speaker) |>
  summarise(total_count = n()) |>
  arrange(desc(total_count))

# Print a heading followed by the table
cat("Occurrences of 'I love you' by each main character:\n")
## Occurrences of 'I love you' by each main character:
print(love_you_count)
## # A tibble: 6 × 2
##   speaker        total_count
##   <chr>                <int>
## 1 Monica Geller           45
## 2 Chandler Bing           33
## 3 Ross Geller             31
## 4 Rachel Green            25
## 5 Phoebe Buffay           19
## 6 Joey Tribbiani          11
# Character(s) with the highest "I love you" count. Ties are all kept.
most_love <- love_you_count |>
  filter(total_count == max(total_count))

# Report the winner. Tied names are joined with commas and the shared count
# is printed once, so the sentence stays readable when there is a tie.
cat(
  "The character who said 'I love you' the most is:",
  paste(most_love$speaker, collapse = ", "),
  "with", most_love$total_count[1], "occurrences.\n"
)
## The character who said 'I love you' the most is: Monica Geller with 45 occurrences.
# "I love you" lines per season and main character, ordered by season.
# Season/character pairs with no matching line do not appear in the result.
love_you_by_season <- friends |>
  filter(
    speaker %in% main_characters,
    str_detect(text, "I love you")
  ) |>
  group_by(season, speaker) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(season)


# One line per main character showing "I love you" counts across seasons.
# Line thickness uses `linewidth` (the `size` aesthetic for lines is
# deprecated since ggplot2 3.4.0), and the title's spelling of "characters"
# is corrected.
ggplot(love_you_by_season, aes(x = season, y = total_count, color = speaker)) +
  geom_line(linewidth = 1) +                             # Add lines for each character
  geom_point(size = 3) +                                 # Add points to the lines
  theme_minimal() +
  labs(
    title = "Occurrences of 'I love you' by six main characters Through the Seasons",
    x = "Season",
    y = "Total Occurrences"
  ) +
  scale_x_continuous(breaks = 1:10, limits = c(1, 10)) +  # Set x-axis from 1 to 10 (seasons)
  scale_y_continuous(breaks = seq(0, max(love_you_by_season$total_count), by = 1)) +  # Set y-axis as integers
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

# Drop four episodes by row position before modelling.
# NOTE(review): rows 36, 37, 235 and 236 are removed without a stated reason
# (presumably outlying episodes -- confirm). Because friends_info is
# overwritten, running this step again removes four further rows.
friends_info <- friends_info |>
  filter(!row_number() %in% c(36, 37, 235, 236))

# IMDb rating modelled on US viewership, with the matching scatter plot
fit <- lm(imdb_rating ~ us_views_millions, data = friends_info)
plot(friends_info$us_views_millions, friends_info$imdb_rating)

# The same pair of variables with the roles of response and predictor swapped
fit.1 <- lm(us_views_millions ~ imdb_rating, data = friends_info)
plot(friends_info$imdb_rating, friends_info$us_views_millions)

# Diagnostic plots for the rating ~ viewership model
plot(fit)

# Coefficients and fit statistics for the same model
summary(fit)
## 
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14531 -0.24731 -0.02954  0.21320  1.16609 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.691313   0.158578  48.502  < 2e-16 ***
## us_views_millions 0.030419   0.006295   4.832 2.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3669 on 230 degrees of freedom
## Multiple R-squared:  0.09217,    Adjusted R-squared:  0.08822 
## F-statistic: 23.35 on 1 and 230 DF,  p-value: 2.467e-06
# Box-Cox profile for the response of `fit`.
# boxcox() is called through the MASS namespace instead of attaching the
# package: library(MASS) masks dplyr::select(), which breaks unqualified
# select() calls made on data frames afterwards in the same session.
MASS::boxcox(fit)

# Refit the rating ~ viewership model under three response transformations,
# showing the diagnostic plots for each in turn.

# Reciprocal of the rating
fit.reciprocal <- lm(1 / imdb_rating ~ us_views_millions, data = friends_info)
# Natural log of the rating
fit.log <- lm(log(imdb_rating) ~ us_views_millions, data = friends_info)
# Square root of the rating
fit.sqrt <- lm(sqrt(imdb_rating) ~ us_views_millions, data = friends_info)

# Diagnostics, in the order reciprocal, log, square root
for (transformed_fit in list(fit.reciprocal, fit.log, fit.sqrt)) {
  plot(transformed_fit)
}